Load Libraries

library(DistributionUtils)
## Loading required package: RUnit
library(scales)
library(ggplot2)
library(reshape)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following object is masked from 'package:reshape':
## 
##     melt
library(IRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, as.vector, cbind, colnames, do.call,
##     duplicated, eval, evalq, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, rank, rbind, rownames,
##     sapply, setdiff, sort, table, tapply, union, unique, unlist,
##     unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## 
## The following object is masked from 'package:reshape':
## 
##     rename
## 
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:data.table':
## 
##     shift
## 
## The following object is masked from 'package:reshape':
## 
##     expand
library(GenomicRanges)
## Loading required package: GenomeInfoDb
wd = "/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/"

Load input files

setwd(wd)

EIC_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo_withHumanFilter/EICv2/analytics_output.txt")
ECS_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo_withHumanFilter/ECS/analytics_output.txt")
CS2_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo_withHumanFilter/CS2/analytics_output.txt")
setwd(wd)

plot_contigs <- function(blastData, length_filt, identity_filt, var_gene_lengths){
#   blastData <-EIC_processed_contigs
#   identity_filt <- 95
#   length_filt <- 500
  #first filter out unmatched contigs and convert types
  blastData <- data.frame(subset(blastData, sequence.id != '-'))
  blastData$alignment.length <-as.numeric(as.character(blastData$alignment.length))
  blastData$s..start <-as.numeric(as.character(blastData$s..start))
  blastData$s..end <-as.numeric(as.character(blastData$s..end))
  blastData$q..start <-as.numeric(as.character(blastData$q..start))
  blastData$q..end <-as.numeric(as.character(blastData$q..end))
  blastData$percent.identity <-as.numeric(as.character(blastData$percent.identity))
  
  #now filter out hits we aren't interested in
  #blastData <- subset(blastData, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
  #blastData <- subset(blastData, alignment.length > 800) #throw away contigs less than 3 reads ~ length residues long
  #blastData <- subset(blastData, percent.identity > identity_filt) #throw away alignments that share less than x%
  
  #now split into seperate data frames for each database
  var_hits <- subset(blastData, grepl("blastHits_Full_var_rask2010*",data.base.name))
  var_hits <- subset(var_hits, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
  var_hits <- subset(var_hits, percent.identity > identity_filt) #throw away alignments that share less than x%
  
  #var_hits <-subset(var_hits, (sequence.id == 'IT4var04') | (sequence.id == 'IT4var08'))
  human_hits <- subset(blastData, grepl("blastHits_human_reference*",data.base.name))
  Pf3D7_hits <- subset(blastData, grepl("blastHits_Pf3D7_reference*",data.base.name))
  Pf3D7_hits <- subset(Pf3D7_hits, alignment.length > 100) #throw away contigs less than 3 reads ~ length residues long
  Pf3D7_hits <- subset(Pf3D7_hits, percent.identity > 85) #throw away alignments that share less than x%
  
  #filter out human and Pf3D7 contigs that aren't matched to any var genes
  var_contigs <- unique(var_hits$contig.id)
  human_hits <- subset(human_hits, contig.id %in% var_contigs)
  Pf3D7_hits <- subset(Pf3D7_hits, contig.id %in% var_contigs)
  
  #now annoying stuff so ggplot doesnt get upset (give each contig a different height?)
  ydf <- data.frame(contig.id=unique(var_hits$contig.id) , y=as.numeric(factor(unique(var_hits$contig.id))))
  var_hits <- merge(var_hits, ydf, by="contig.id")
  human_hits <- merge(human_hits, ydf, by="contig.id")
  Pf3D7_hits <- merge(Pf3D7_hits, ydf, by="contig.id")
  
  
  #Now generate the underlying contigs (full length)
  contig <- var_hits
  contig$s..start <- contig$s..start - contig$q..start
  contig$s..end <- contig$s..start+contig$contig.length
  
   #now change the Pf3D7 sequence to be in terms of the var reference VERY HACKY
  diff <- data.frame(contig.id=contig$contig.id, diff=contig$s..start, diff_seq=contig$sequence.id)
  diff <- subset(diff, !duplicated(contig.id))
  diff <- merge(diff, Pf3D7_hits, by="contig.id")
  
  Pf3D7_hits$s..start <- diff$diff  + Pf3D7_hits$q..start
  Pf3D7_hits$s..end <- diff$diff + Pf3D7_hits$q..end
  Pf3D7_hits$sequence.id <- diff$diff_seq
  
  #get the gene lengths from rask data
  var_gene_lengths <- var_gene_lengths[var_gene_lengths$sequence.id %in% intersect(var_gene_lengths$sequence.id, contig$sequence.id),]
  
  
  
  gg <- ggplot(data=contig, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1, fill=factor(contig.id))) + geom_rect( alpha=0.4)
  gg <- gg + geom_rect(data=var_hits, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1, fill=factor(contig.id), labels = rpk))
  #gg <- gg + geom_rect(data=Pf3D7_hits, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1), color="black", alpha=0.5)
  
  gg <- gg + geom_text(data=var_hits, aes(x=s..start+(s..end-s..start)/2, y=y+(y+1-y)/2, label=paste(contig.id,rpk, sep=" ")), size=8) 
  gg <- gg + geom_vline(data = var_gene_lengths, aes(xintercept = gene.length))
  gg <- gg + facet_wrap(~sequence.id)
  gg <- gg + xlab("location on reference sequence")
  gg <- gg + ylab("") + scale_y_continuous(breaks=NULL)
  gg <- gg + mytheme()
  print(gg)
  #paste(percent.identity,mismatches,gap.opens, sep=" ")
  rpk_counts <- var_hits
  rpk_counts$rpk <- rpk_counts$rpk*(rpk_counts$alignment.length/rpk_counts$contig.length)
  rpk_counts <- rpk_counts[,(names(rpk_counts) %in% c('sequence.id','rpk'))]
  rpk_counts <- aggregate(. ~ sequence.id, data=rpk_counts, sum)
  
  names(rpk_counts) <- c("var.name", "contig.count")
  return(rpk_counts)
}

Function to calculate redundancy

setwd(wd)

redundancy <- function(blastData, length_filt, var_gene_lengths, identity_filt=97){
#   blastData <-CS2_processed_contigs
  #first filter out unmatched contigs and convert types
  blastData <- data.frame(subset(blastData, sequence.id != '-'))
  blastData$alignment.length <- as.numeric(as.character(blastData$alignment.length))
  blastData$s..start <- as.numeric(as.character(blastData$s..start))
  blastData$s..end <- as.numeric(as.character(blastData$s..end))
  blastData$q..start <- as.numeric(as.character(blastData$q..start))
  blastData$q..end <- as.numeric(as.character(blastData$q..end))
  blastData$percent.identity <- as.numeric(as.character(blastData$percent.identity))
  blastData$alignment.length <- as.numeric(as.character(blastData$alignment.length))
  blastData$bit.score <- as.numeric(as.character(blastData$bit.score))
  
  #now split into seperate data frames for each database
  var_hits <- subset(blastData, grepl("blastHits_Full_var_rask2010*",data.base.name))
  var_hits <- subset(var_hits, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
  var_hits <- subset(var_hits, percent.identity > identity_filt) #throw away alignments that share less than x%
  
  #var_hits <-subset(var_hits, (sequence.id == 'IT4var04') | (sequence.id == 'IT4var08'))
  human_hits <- subset(blastData, grepl("blastHits_human_reference*",data.base.name))
  Pf3D7_hits <- subset(blastData, grepl("blastHits_Pf3D7_reference*",data.base.name))
  Pf3D7_hits <- subset(Pf3D7_hits, alignment.length > 100) #throw away contigs less than 3 reads ~ length residues long
  Pf3D7_hits <- subset(Pf3D7_hits, percent.identity > 85) #throw away alignments that share less than x%
  
  #filter out human and Pf3D7 contigs that aren't matched to any var genes
  var_contigs <- unique(var_hits$contig.id)
  human_hits <- subset(human_hits, contig.id %in% var_contigs)
  Pf3D7_hits <- subset(Pf3D7_hits, contig.id %in% var_contigs)
  
  #sort var hit by name then bit score
  var_hits <- var_hits[with(var_hits, order(contig.id, bit.score, decreasing=TRUE)), ]
  var_hits <- droplevels(var_hits)
  #remove duplicated contigs which have overlapping query regions.
  
  VH <- as.data.table(var_hits)
  VH$q..start <- VH$q..start+250
  VH$q..end <- VH$q..end-250
  VH[,group := { 
      ir <-  IRanges(q..start, q..end);
      subjectHits(findOverlaps(ir, reduce(ir), minoverlap=0))
      },by=contig.id]
  VH$q..start <- VH$q..start-250
  VH$q..end <- VH$q..end+250
 
  
  
  VH <- data.frame(VH)
  VH <- VH[!duplicated(VH[,colnames(VH) %in% c('contig.id', 'group')]),]
  VH$group <- NULL
  
  total.alignment <- sum(VH$alignment.length)

  gr = GRanges(VH$sequence.id, 
            IRanges(VH$s..start, VH$s..end), 
                strand="*") 
  re <- reduce(gr)
  total.coverage <- sum(width(ranges(re)))
  
  redundancy <- total.alignment/total.coverage
  return(redundancy)
}

Load more data

setwd(wd)

#expression data
var_expressions_mike <- read.csv("/wehisan/home/allstaff/t/tonkin-hill.g/assembly_testing/counts/var_expressions_mike.csv")

#read count data
read_counts_against_IT4_genes <- read.csv("/wehisan/home/allstaff/t/tonkin-hill.g/assembly_testing/counts/read_counts_against_IT4_genes.csv")
CS2_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$CS2, 'counts'=read_counts_against_IT4_genes$counts)
ECS_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$ECS, 'counts'=read_counts_against_IT4_genes$counts.1)
EIC_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$EIC, 'counts'=read_counts_against_IT4_genes$counts.2)

#gene length data
var_gene_lengths <- read.delim("/wehisan/home/allstaff/t/tonkin-hill.g/find_var_genes/assemble_var/data/var_rask_lengths.txt", header=F)
colnames(var_gene_lengths) <- c("sequence.id", "gene.length")
var_gene_lengths <- data.frame(var_gene_lengths)

#rpkm
CS2_read_counts <- merge(CS2_read_counts, var_gene_lengths, by.x="var.name", by.y="sequence.id")
CS2_read_counts$counts <- CS2_read_counts$counts/CS2_read_counts$gene.length
CS2_read_counts$gene.length <- NULL
ECS_read_counts <- merge(ECS_read_counts, var_gene_lengths, by.x="var.name", by.y="sequence.id")
ECS_read_counts$counts <- ECS_read_counts$counts/ECS_read_counts$gene.length
ECS_read_counts$gene.length <- NULL
EIC_read_counts <- merge(EIC_read_counts, var_gene_lengths, by.x="var.name", by.y="sequence.id")
EIC_read_counts$counts <- EIC_read_counts$counts/EIC_read_counts$gene.length
EIC_read_counts$gene.length <- NULL

# #more rpkm
# EIC_processed_contigs <- merge(EIC_processed_contigs, var_gene_lengths, by.x="sequence.id", by.y="sequence.id")
# EIC_processed_contigs$rpk <- EIC_processed_contigs$rpk/EIC_processed_contigs$gene.length
# EIC_processed_contigs$gene.length <- NULL

For the contig plots we restrict the contigs to those that are longer than 1000 residues and share more than 95% similarity with the reference. Any contig that has overlapped with human, falciparum or vivax (non-var) by more than 30% has also been removed. In each of the contig plots below the lighter shaded blocks represent the contigs whilst the dark colouring represents those parts of the contigs that have aligned to the reference. If contigs map to more than one reference gene they can appear more than once.

Lets look at a plot of the contigs for ECS.

setwd(wd)

contig_counts <- plot_contigs(ECS_processed_contigs, 800, 95, var_gene_lengths)

Next we look at the redundancy of this assembly. Redundancy is calculated by first filtering contigs to those that align to VAR with a length of at least 800 and identity of 97. We then only allow each region of each transcript to only map to one reference region. This is done rather harshly in that if two blast hit overlap from on the same region of the transcript we take the one with the high bit score. Reduncdancy is then calculated as \[ \frac{\text{total alignment length}}{\text{total number of nucleotide in reference that are covered by hits}} \]

setwd(wd)

redundancy(ECS_processed_contigs, 800, var_gene_lengths, 95)
## [1] 1

For the expression plots I first aligned all the reads for each sample onto all the IT4var genes. This is reffered to as ‘whole gene count’. I then aligned all the reads for each sample onto the contigs output by our assembly pipeline. This is reffered to as ‘contig count’. Finally expression count are the 2exp-deltadeltaCt numbers from the quantification assay. All these counts are normalised and then plotted on a log10 scale. Thus each point on the y-axis can be interpreted as 0.1% of the total for that count type and that sample.

plot expression for ECS

setwd(wd)

ecs_df <- merge(var_expressions_mike, ECS_read_counts, by='var.name')
ecs_df <- ecs_df[,!(names(ecs_df) %in% c('CS2','EIC'))]
ecs_df$counts <- ecs_df$counts/sum(ecs_df$counts)*1000000
ecs_df$ECS <- ecs_df$ECS
ecs_df <- merge(ecs_df, contig_counts, all.x=TRUE, by='var.name')
ecs_df$contig.count[is.na(ecs_df$contig.count)] <- 0
ecs_df$contig.count <- ecs_df$contig.count/sum(ecs_df$contig.count)*1000000
names(ecs_df) <- c('Rask_name','Expression_name','qPCR expression value','whole gene RPKM','contig RPKM')
ecs_df2 <- melt(ecs_df, id=c('Rask_name','Expression_name'))
ecs <- ggplot(ecs_df2, aes(Rask_name, fill=factor(variable), weight=value)) + geom_bar(position="dodge")
ecs <- ecs + facet_wrap(~ variable, scales="free_y",  ncol = 1)
ecs <- ecs + mybartheme()
ecs <- ecs + ylab("Expression profile (RPKM/qPCR)") + xlab("Gene Name")
ecs

Lets look at a plot of the contigs for CS2

setwd(wd)

contig_counts <- plot_contigs(CS2_processed_contigs, 800, 95, var_gene_lengths)

Redundancy

setwd(wd)

redundancy(CS2_processed_contigs, 800, var_gene_lengths, 95)
## [1] 1

plot expression for CS2

setwd(wd)

cs2_df <- merge(var_expressions_mike, CS2_read_counts, by='var.name')
cs2_df <- cs2_df[,!(names(cs2_df) %in% c('ECS','EIC'))]
cs2_df$counts <- cs2_df$counts/sum(cs2_df$counts)*1000000
cs2_df$CS2 <- cs2_df$CS2
cs2_df <- merge(cs2_df, contig_counts, all.x=TRUE, by='var.name')
cs2_df$contig.count[is.na(cs2_df$contig.count)] <- 0
cs2_df$contig.count <- cs2_df$contig.count/sum(cs2_df$contig.count)*1000000
names(cs2_df) <- c('Rask_name','Expression_name','qPCR expression value','whole gene RPKM','contig RPKM')
cs2_df2 <- melt(cs2_df, id=c('Rask_name','Expression_name'))
cs2 <- ggplot(cs2_df2, aes(Rask_name, fill=factor(variable), weight=value)) + geom_bar(position="dodge")
cs2 <- cs2 + scale_y_continuous(limits=c(0,NA))
cs2 <- cs2 + facet_wrap(~ variable, scales="free_y",  ncol = 1)
cs2 <- cs2 + mybartheme()
cs2 <- cs2 + ylab("Expression profile (RPKM/qPCR)") + xlab("Gene Name")
cs2

Lets look at a plot of the contigs for EIC As their was a higher rate of similarities in EIC I’ve upped the similarity threshold to 97%

setwd(wd)

contig_counts <- plot_contigs(EIC_processed_contigs, 500, 95, var_gene_lengths)

Redundancy

setwd(wd)

redundancy(EIC_processed_contigs, 500, var_gene_lengths, 95)
## [1] 1.004301

plot expression for EIC

setwd(wd)

eic_df <- merge(var_expressions_mike, EIC_read_counts, by='var.name')
eic_df <- eic_df[,!(names(eic_df) %in% c('ECS','CS2'))]
eic_df$counts <- eic_df$counts/sum(eic_df$counts)*1000000
eic_df$EIC <- eic_df$EIC
eic_df <- merge(eic_df, contig_counts, all.x=TRUE, by='var.name')
eic_df$contig.count[is.na(eic_df$contig.count)] <- 0
eic_df$contig.count <- eic_df$contig.count/sum(eic_df$contig.count)*1000000
names(eic_df) <- c('Rask_name','Expression_name','qPCR expression value','whole gene RPKM','contig RPKM')
eic_df2 <- melt(eic_df, id=c('Rask_name','Expression_name'))
eic <- ggplot(eic_df2, aes(Rask_name, fill=factor(variable), weight=value)) + geom_bar(position="dodge")
eic <- eic + scale_y_continuous(limits=c(0,NA))
eic <- eic + mybartheme()
eic <- eic + facet_wrap(~ variable, scales="free_y",  ncol = 1)
eic <- eic + ylab("Expression profile (RPKM/qPCR)") + xlab("Gene Name")
eic

Now we calculate some summary statistics for this comparison. First the number of IT4var genes found and the total found by qPCR above 1e-3.

setwd(wd)

countQPCRExpressed <- sum(eic_df$'qPCR expression value'>1e-3)
countContigExpressed <- sum(eic_df$'contig RPKM'>0)

A total of 43 were found above a threshold of \(1e-3\) by qPCR. After assembly a total of 40 genes were identified.

Now we look at the overall similarity of coverage. This is rather sensitive to single genes with large differences in exression profiles between the methods.

setwd(wd)

qPCR <- eic_df$'qPCR expression value'
qPCR[qPCR<1e-3] <- 0
contig <- eic_df$'contig RPKM'

The Spearman correlation in expression profiles is 0.6538091

The Pearson correlation in expression is 0.8044045

warnings()
## NULL